FIGUREPATH <- "~/who_ferritin_comment/results/figures/"
BOOTPATH <- "~/who_ferritin_comment/results/boot/"
DATAPATH <- "~/who_ferritin_comment/data/"
MODELPATH <- "~/who_ferritin_comment/results/models/"
knots <- 3:25
response <- "Hemoglobin"
zoom <- T
boot_n <- 1000Ferritin ID Thresholds
Libraries
source("~/who_ferritin_comment/src/functions.R")
library(dplyr)
library(rms)
library(mcp)
library(boot)
library(scam)Load data
# Load data
# Naming assumptions:
# Ferritin, Hemoglobin, sTfR
data <- readRDS(paste0(DATAPATH, "NHANES_variables_filtered.rds"))Test RCS model selection
(If plots are too small due to layout settings, you may change the chunk option layout-ncol: 4 to 3, 2, or 1. You may also view these plots by accessing your FIGUREPATH, or by right-clicking and selecting Open image in new tab.)
Visualize RCS knots
data_full <- data
data <- data %>% select(Ferritin, as.name(response))
# set datadist
dd <- datadist(data)
options(datadist = "dd")
investigate_RCS(data, response, knots, zoom, FIGUREPATH)Bootstrap knot selection
Select optimum number of knots via bootstrapping and AIC
# The function boot_knot_selection is defined in functions.R
if (!file.exists(paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))) {
boot_knots <- boot(data = data, statistic = boot_knot_selection, R = boot_n, response = response, knots = knots)
boot_knot_ci <- boot.ci(boot_knots, type = "perc") # BCa not computable on discrete data with this tight interval
saveRDS(boot_knots, paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))
saveRDS(boot_knot_ci, paste0(BOOTPATH, response, "_rcs_knot_ci_", boot_n, "_boot_ci.rds"))
} else {
boot_knots <- readRDS(paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))
boot_knot_ci <- readRDS(paste0(BOOTPATH, response, "_rcs_knot_ci_", boot_n, "_boot_ci.rds"))
}
boot_df <- data.frame(t = boot_knots$t) %>%
mutate(within_ci = (t >= boot_knot_ci$percent[4] & t <= boot_knot_ci$percent[5]))
bootknot_ci_p <- ggplot(data = boot_df, aes(x = t, y = ..count../sum(..count..), fill = within_ci)) +
geom_bar() +
theme_minimal() +
labs(x = "Selected number of knots",
y = "Frequency",
fill = "Within CI")
ggsave(paste0(FIGUREPATH, response, "_rcs_boot_knot_distr.pdf"), bootknot_ci_p, width = 8, height = 6, device = cairo_pdf)
bootknot_ci_pSelect optimum number of knots via bootstrapping and AIC
selected_knot_n <- boot_knots$t0
# Final RCS fit
rcs_fit <- ols( as.formula(sprintf("%s ~ rcs(Ferritin, %s)", response, selected_knot_n)), data = data, x = T, y = T)
saveRDS(rcs_fit, paste0(MODELPATH, "rcs_final.rds"))Bootstrap threshold
Now use selected knot number to bootstrap saddle estimates
Find estimates for saddle point via bootstrapping
# The function boot_saddle is defined in functions.R
if (!file.exists(paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))) {
boot_saddle <- boot(data = data, statistic = boot_saddle_estimate, R = boot_n, response = response, knot_n = selected_knot_n)
invalid_indices <- which(boot_saddle$t == 0)
boot_saddle_ci <- boot.ci(boot_saddle, type = "perc", exclude = invalid_indices)
saveRDS(boot_saddle, paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))
saveRDS(boot_saddle_ci, paste0(BOOTPATH, response, "_rcs_saddle_ci_", boot_n, "_boot_ci.rds"))
} else {
boot_saddle <- readRDS(paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))
boot_saddle_ci <- readRDS(paste0(BOOTPATH, response, "_rcs_saddle_ci_", boot_n, "_boot_ci.rds"))
invalid_indices <- which(boot_saddle$t == 0)
}
paste0("Bootstrapped saddle point estimate is computable in ", round((1 - length(invalid_indices)/length(boot_saddle$t)) * 100, 2), "% of iterations.")[1] "Bootstrapped saddle point estimate is computable in 99.9% of iterations."
Find estimates for saddle point via bootstrapping
rcs_final_p <- plot_rcs(data, rcs_fit, n_knot = selected_knot_n, zoom = T, FIGUREPATH = NULL) +
annotate('rect', xmin = boot_saddle_ci$percent[4], xmax = boot_saddle_ci$percent[5], ymin = -Inf, ymax = Inf, alpha = 0.2, fill = 'black') +
labs(subtitle = paste0("Saddle point: ", round(boot_saddle_ci$t0, 2), " μg/L [", round(boot_saddle_ci$percent[4], 2), ", ", round(boot_saddle_ci$percent[5], 2), "]"))
ggsave(paste0(FIGUREPATH, response, "_rcs_final.pdf"), rcs_final_p, width = 8, height = 6, device = cairo_pdf)
rcs_final_pFind estimates for saddle point via bootstrapping
rcs_final_saddle_density_p <-
ggplot(data = data.frame(t = boot_saddle$t), aes(x = t)) +
geom_density() +
theme_minimal() +
labs(x = "Saddle point",
y = "Density")
ggsave(paste0(FIGUREPATH, response, "_rcs_final_saddle_density.pdf"), rcs_final_saddle_density_p, width = 8, height = 6, device = cairo_pdf)
rcs_final_saddle_density_pBayesian breakpoint analysis
We limit our data to here to Ferritin <= 100, otherwise the 2 breakpoint version will place the second breakpoint way beyond it. This decision is a source of bias.
1 breakpoint
Bayesian breakpoint analysis, 1 breakpoint
concat_data <- data %>%
filter(Ferritin <= 100)
if (!file.exists(paste0(MODELPATH, response, "_bayesian_1bp.rds"))) {
# Specify model
if (response == "Hemoglobin") {
baye_1bp_spec <- list(Hemoglobin ~ Ferritin,
~ 0 + Ferritin)
}
if (response == "sTfR") {
baye_1bp_spec <- list(sTfR ~ Ferritin,
~ 0 + Ferritin)
}
# Fit
baye_1bp_fit <- mcp(baye_1bp_spec,
data = concat_data,
adapt = 500,
iter = 500,
chains = 4,
cores = 4)
saveRDS(baye_1bp_fit, paste0(MODELPATH, response, "_bayesian_1bp.rds"))
} else {
baye_1bp_fit <- readRDS(paste0(MODELPATH, response, "_bayesian_1bp.rds"))
}
summary(baye_1bp_fit)Family: gaussian(link = 'identity')
Iterations: 2000 from 4 chains.
Segments:
1: Hemoglobin ~ Ferritin
2: Hemoglobin ~ 1 ~ 0 + Ferritin
Population-level parameters:
name mean lower upper Rhat n.eff
Ferritin_1 0.2094 0.1908 0.2303 1.1 10
Ferritin_2 0.0074 0.0062 0.0088 1.0 323
cp_1 14.7588 14.0805 15.5867 1.1 39
int_1 10.1398 9.9456 10.3195 1.1 19
sigma_1 1.0115 0.9927 1.0304 1.0 1200
Bayesian breakpoint analysis, 1 breakpoint
# Plot
bay1_p <- plot(baye_1bp_fit) +
theme_minimal() +
coord_cartesian(xlim = c(0, 100))
posterior_check_p <- pp_check(baye_1bp_fit)
convergence_check_p <- plot_pars(baye_1bp_fit, regex_pars = "cp_")
ggsave(paste0(FIGUREPATH, response, "_bay1bp.pdf"), bay1_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay1bp_ppcheck.pdf"), posterior_check_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay1bp_convcheck.pdf"), convergence_check_p, width = 8, height = 6, device = cairo_pdf)
bay1_pBayesian breakpoint analysis, 1 breakpoint
posterior_check_pBayesian breakpoint analysis, 1 breakpoint
convergence_check_p2 breakpoints
Bayesian breakpoint analysis, 2 breakpoints
if (!file.exists(paste0(MODELPATH, response, "_bayesian_2bp.rds"))) {
# Specify model
if (response == "Hemoglobin") {
baye_2bp_spec <- list(Hemoglobin ~ Ferritin,
~ 0 + Ferritin,
~ 0 + Ferritin)
}
if (response == "sTfR") {
baye_2bp_spec <- list(sTfR ~ Ferritin,
~ 0 + Ferritin,
~ 0 + Ferritin)
}
# Fit
baye_2bp_fit <- mcp(baye_2bp_spec,
data = concat_data,
adapt = 500,
iter = 500,
chains = 4,
cores = 4)
saveRDS(baye_2bp_fit, paste0(MODELPATH, response, "_bayesian_2bp.rds"))
} else {
baye_2bp_fit <- readRDS(paste0(MODELPATH, response, "_bayesian_2bp.rds"))
}
summary(baye_2bp_fit)Family: gaussian(link = 'identity')
Iterations: 2000 from 4 chains.
Segments:
1: Hemoglobin ~ Ferritin
2: Hemoglobin ~ 1 ~ 0 + Ferritin
3: Hemoglobin ~ 1 ~ 0 + Ferritin
Population-level parameters:
name mean lower upper Rhat n.eff
Ferritin_1 1.0863 0.5739 1.5087 10.5 9
Ferritin_2 0.1500 0.1193 0.1761 2.0 21
Ferritin_3 0.0068 0.0051 0.0082 1.0 230
cp_1 4.4437 3.4641 6.0420 6.1 15
cp_2 16.9235 15.6629 18.8014 1.4 30
int_1 6.8142 5.7630 8.3873 8.5 13
sigma_1 1.0052 0.9867 1.0239 1.0 1642
Bayesian breakpoint analysis, 2 breakpoints
# Plot
bay2_p <- plot(baye_2bp_fit) +
theme_minimal() +
coord_cartesian(xlim = c(0, 100))
posterior_check_p <- pp_check(baye_2bp_fit)
convergence_check_p <- plot_pars(baye_2bp_fit, regex_pars = "cp_")
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), bay2_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), posterior_check_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), convergence_check_p, width = 8, height = 6, device = cairo_pdf)
bay2_pBayesian breakpoint analysis, 2 breakpoints
posterior_check_pBayesian breakpoint analysis, 2 breakpoints
convergence_check_pPlay with scam
scamfit <- scam(Hemoglobin ~ s(Ferritin, k = 8, bs = "mpi"), data = concat_data)
plot(scamfit, residuals = T)From the supplement of Galetti et al. (2021):
To identify thresholds, we then used the method of finite differences to estimate the first derivatives of the fitted trend spline from the generalized additive modelling smoother and we identified the periods in which the estimated rate of change between fitted y and x was statistically significant (i.e. statistically different from zero) by computing the uncertainty around the estimates from 10 000 posterior simulations. At the point on the x-axis where 95% of the posterior simulations are all different from zero for the first derivative, we could identify where the slope between fitted y and x began to be signiicantly positive (or negative).
The issue currently is that we are deliberately choosing to shape restrict the fit to a monotonically increasing curve \(\rightarrow\) the derivative can never cross zero, so we’d have to choose an arbitrary number “close to zero” instead, which will be difficult to justify.
d1 <- derivative.scam(scamfit)
xx <- sort(concat_data$Ferritin, index = TRUE)
plot(xx$x, d1$d[xx$ix], type = "l")